#######################################################
### Replication Files for: Pick Your Language       ###
### Published in the Asian Journal of Communication ###
### William O'Brochta                               ###
### Washington University in St. Louis              ###
#######################################################

### Import TOI Data

library(tm)
library(tm.plugin.factiva)

#Set working directory to find HTML files in "TOIHTML" folder
#Collect the names of every .htm/.html file in the working directory
html <- list.files(pattern="\\.(htm|html)$")

#The commented-out import steps below run very slowly
#Step 1: create one Factiva source per HTML file
#for(i in 1:length(html)){
#  assign(paste0('factiva',i), FactivaSource(paste0('Factiva',i,'.html'), format='HTML'))
#}

#Step 2: turn the Factiva sources into corpora (corpus1, corpus2, ...)
#for(i in 1:length(html)){
#  assign(paste0('corpus',i), Corpus(FactivaSource(paste0('Factiva',i,'.html'), format='HTML')))
#}

#Address two errors in the HTML import
#NOTE(review): 'factiva22' below is not created anywhere in this file;
#presumably 'factiva21_new' was intended -- confirm before uncommenting
#factiva21_new<-FactivaSource('Factiva21_new.html', format='HTML')
#factiva2<-FactivaSource('Factiva2.html', format='HTML')
#corpus1<-Corpus(factiva22, readerControl = list(language=NA))
#corpus2<-Corpus(factiva2, readerControl = list(language=NA))

#Combine individual corpora together into one corpus named "toi"
#Corpus with 7365 articles
#Objects corpus1 ... corpus74 must already exist (they are produced by the
#commented-out import steps above); they are fetched by name and concatenated
#in numeric order
corpusNames <- paste0('corpus', 1:74)
toi <- do.call(c, unname(mget(corpusNames, envir = environment(), inherits = TRUE)))

#save(toi, file='toi.RData')




### Sentiment scoring function: net count of positive minus negative words
#Arguments:
#  sentences: vector of texts to score
#  pos.words, neg.words: character vectors of positive / negative words
#    (read further below from pos_words.txt and neg_words.txt)
#  .progress: progress-bar type passed on to plyr::laply
#Returns a data frame with one row per text: `scores` (number of positive
#matches minus number of negative matches) and `text` (the input).
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none'){
  #Attaches plyr (laply) and stringr (str_split) as a side effect
  require(plyr)
  require(stringr)

  #Score a single text
  netWordCount <- function(txt, pos.words, neg.words){
    #Strip punctuation, control characters and digits, then lowercase
    cleaned <- gsub('[[:punct:]]', '', txt)
    cleaned <- gsub('[[:cntrl:]]', '', cleaned)
    cleaned <- gsub('\\d+', '', cleaned)
    cleaned <- tolower(cleaned)

    #Tokenise on runs of whitespace
    tokens <- unlist(str_split(cleaned, '\\s+'))

    #Every token occurrence found in a lexicon counts once
    nPositive <- sum(tokens %in% pos.words)
    nNegative <- sum(tokens %in% neg.words)
    nPositive - nNegative
  }

  scores <- laply(sentences, netWordCount, pos.words, neg.words, .progress=.progress)
  data.frame(scores=scores, text=sentences)
}

#Count the words in each element of `string`
#  string: character vector of texts
#  pseudo: single logical flag. If TRUE, every run of non-whitespace
#          characters counts as a word; if FALSE (default), only runs of
#          alphabetic characters count
#Returns the vector of counts produced by stringr::str_count
nwords <- function(string, pseudo=FALSE){
  #`pseudo` is a scalar switch, so a plain if/else picks the pattern
  #(the previous ifelse() call was only used for its assignment side effect)
  pattern <- if (pseudo) "\\S+" else "[[:alpha:]]+"
  #Namespace-qualified so the function works without stringr being attached
  stringr::str_count(string, pattern)
}

#Import positive and negative words from Hu and Liu.
#Everything after a ';' on a line of the lexicon files is treated as a comment
pos.words <- scan(file='pos_words.txt', what=character(), comment.char=';')
neg.words <- scan(file='neg_words.txt', what=character(), comment.char=';')



### Import data and begin sentiment and word count analysis
library(plyr)
library(dplyr)
library(tm)
library(tidytext)
library(topicmodels)
library(ldatuning)

#Prepare TOI dataset
load('toi.RData')
#Flatten the corpus into a plain data frame.
#The date is in column 2, the article title is in column 4,
#the article text is in column 19.
toi_td <- as.data.frame(tidy(toi))
names(toi_td)[c(2,4,19)] <- c('Date', 'Title', 'Text')

#Shared cleaning step: drop punctuation, control characters and digits,
#then convert to lowercase
standardizeText <- function(sentence){
  noPunct <- gsub('[[:punct:]]','',sentence)
  noCntrl <- gsub('[[:cntrl:]]','',noPunct)
  noDigits <- gsub('\\d+','',noCntrl)
  tolower(noDigits)
}

#Standardize titles
toi_td[,4] <- laply(toi_td[,4], standardizeText)

#Standardize article text
toi_td[,19] <- laply(toi_td[,19], standardizeText)

#Convert date into readable format
toi_td$Date <- as.Date(toi_td$Date, format="%Y-%m-%d")


#Prepare Hindustan dataset
#Article text is in column 5, article title is in column 4
hindustan <- read.csv('data_links_Hindustan_day1.csv', header=TRUE, stringsAsFactors=FALSE)

#Shared cleaning step: drop punctuation, control characters and digits,
#then convert to lowercase
standardizeText <- function(sentence){
  noPunct <- gsub('[[:punct:]]','',sentence)
  noCntrl <- gsub('[[:cntrl:]]','',noPunct)
  noDigits <- gsub('\\d+','',noCntrl)
  tolower(noDigits)
}

#Standardize article text
hindustan[,5] <- laply(hindustan[,5], standardizeText)

#Standardize titles; titles are first re-encoded as UTF-8
hindustan[,4] <- laply(enc2utf8(hindustan[,4]), standardizeText)

#Convert date into a readable format (input looks like "Mon, 01 Jan 2010")
hindustan$Date <- as.Date(hindustan$Date, format="%a, %d %b %Y")



### Score sentiment of Hindustan and TOI datasets
#Create TDM from Hindustan dataset
hindustan_corpus <- Corpus(VectorSource(hindustan[,5]))
hindustan_freq <- as.matrix(TermDocumentMatrix(hindustan_corpus))
#One row per term with its total frequency, most frequent first,
#plus each term's share (in percent) of all term occurrences
hindustan_freqMat <- data.frame(ST=rownames(hindustan_freq),
                                Freq=rowSums(hindustan_freq),
                                row.names=NULL)
hindustan_freqMat <- hindustan_freqMat[order(-hindustan_freqMat$Freq),]
hindustan_freqMat$pct <- 100*(hindustan_freqMat$Freq/sum(hindustan_freqMat$Freq))
#Keep the first 1000 rows
hindustan_freqMat1000 <- hindustan_freqMat[1:1000,]

#Score sentiment of Hindustan data
hindustan_score <- score.sentiment(hindustan[,5], pos.words, neg.words, .progress='text')
hindustan$Counter <- 1
hindustan$score <- hindustan_score$scores
#Average sentiment score per article
sum(hindustan_score$scores)/nrow(hindustan_score)
#Yearly sentiment totals; rows 2 to 9 of the aggregate are kept and
#labelled 2010-2017
hindustan_score_year <- aggregate(score ~ format(Date,"%y"), data=hindustan, FUN=sum)
hindustan_score_year <- hindustan_score_year[2:9,]
hindustan_score_year$Year <- 2010:2017
#Yearly article counts, same rows and labels
hindustan_score_year2 <- aggregate(Counter ~ format(Date,"%y"), data=hindustan, FUN=sum)
hindustan_score_year2 <- hindustan_score_year2[2:9,]
hindustan_score_year2$Year <- 2010:2017

#Count words in Hindustan articles
hindustan$word_count <- as.numeric(lapply(hindustan[,5], nwords))
hindustan_words_year <- aggregate(word_count ~ format(Date,"%y"), data=hindustan, FUN=sum)
hindustan_words_year <- hindustan_words_year[2:9,]
hindustan_words_year$Year <- 2010:2017

#Create TDM from TOI data
toi_freq<-as.matrix(TermDocumentMatrix((Corpus(VectorSource(toi_td[,19])))) )
#One row per term with its total frequency, most frequent first,
#plus each term's share (in percent) of all term occurrences
toi_freqMat<-data.frame(ST=rownames(toi_freq),Freq=rowSums(toi_freq),row.names=NULL)
toi_freqMat<-toi_freqMat[order(-toi_freqMat$Freq),]
toi_freqMat$pct<-100*(toi_freqMat$Freq/sum(toi_freqMat$Freq))
toi_freqMat1000<-toi_freqMat[1:1000,]

#Create loop to do the scoring; different approach because of HTML import
#Articles are scored one at a time into a preallocated vector; seq_len()
#keeps the loop from running when toi_td has no rows (1:0 would iterate)
toi_scores<-numeric(nrow(toi_td))
for(i in seq_len(nrow(toi_td))){
  toi_scores[i]<-score.sentiment(toi_td[i,19],pos.words,neg.words)$scores
}
#Kept as a one-column data frame (column V1), as in the statements below
toi_score<-data.frame(V1=toi_scores)

#toi_score <- score.sentiment(toi_td[,19],pos.words,neg.words, .progress='text')
toi_td$score<-toi_score$V1
toi_td$Counter<-1
#Average sentiment score per article
sum(toi_score)/nrow(toi_score)
#Yearly sentiment totals; rows 2 to 9 of the aggregate are kept and
#labelled 2010-2017
toi_score_year<-aggregate(score ~ format(Date,"%y"), sum, data = toi_td)
toi_score_year<-toi_score_year[c(2:9),]
toi_score_year$Year<-c(2010:2017)
#Yearly article counts, same rows and labels
toi_score_year2<-aggregate(Counter ~ format(Date,"%y"), sum, data = toi_td)
toi_score_year2<-toi_score_year2[c(2:9),]
toi_score_year2$Year<-c(2010:2017)

#Count number of TOI words
#Uses the `wordcount` column of toi_td (not created in this script;
#presumably Factiva metadata carried over by tidy() -- confirm)
toi_words_year<-aggregate(wordcount ~ format(Date,"%y"), sum, data = toi_td)
toi_words_year<-toi_words_year[c(2:9),]
toi_words_year$Year<-c(2010:2017)

#Average sentiment per article by year: column 1 = year,
#column 2 = Hindustan, column 3 = TOI
toi_hindustan_sentiment<-as.data.frame(cbind(2010:2017,
hindustan_score_year$score/hindustan_score_year2$Counter,
toi_score_year$score/toi_score_year2$Counter))



### Prepare article text and run topic models
#Code will take several hours to run

#Combine Hindustan and TOI together to run a single topic model on both
#Source flag: 1 = Hindustan, 0 = TOI
hindustan$Source<-1
toi_td$Source<-0
#Columns 2/5 (Hindustan) and 2/19 (TOI) are taken by position as before
#(presumably date and article text). Source is looked up by name instead of
#by position (6 / 20): those positions only hold Source when the
#sentiment-scoring section, which appends Counter/score columns first, has
#not been run earlier in the same session.
hindustanTOI<-rbind(hindustan[,c(2,5,match('Source',names(hindustan)))],
                    toi_td[,c(2,19,match('Source',names(toi_td)))])

#Note: the same code is used to run 3 topic models
#uncomment the lines corresponding to which of the three models
#you wish to run.
# 1. combined Hindustan and TOI used in main text.
# 2. Just Hindustan articles
# 3. Just TOI articles
#`docs` is only defined once one of the options below is uncommented

#Both Hindustan and TOI
#docs<-hindustanTOI[,2]

#Just Hindustan
#docs<-hindustan[,5]
#docs<-iconv(docs, "latin1", "ASCII", sub="")

#Just TOI
#docs<-toi_td[,19]
#docs1<-iconv(docs[13], to = "ASCII//TRANSLIT")
#docs<-iconv(docs,to='UTF-8')

#Code common to all three models
#Treat empty texts as missing, show their positions, then drop them
docs[docs==""] <- NA
which(is.na(docs))
docs<-docs[complete.cases(docs)]
docs<-Corpus(VectorSource(docs))

#Prepare corpus of articles
#Make lowercase, remove punctuation, remove stopwords
docs<-tm_map(docs, content_transformer(tolower))

#NOTE: despite its name, toSpace deletes the pattern (the replacement is the
#empty string), so hyphenated words are joined rather than split. Left
#unchanged so the published results can be reproduced.
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, '', x))})
docs<-tm_map(docs, toSpace, '-')
docs<-tm_map(docs, toSpace, "'")
docs<-tm_map(docs, removePunctuation)
docs<-tm_map(docs, removeNumbers)
#mystopwords<-stopwords('english')
#mystopwords<-iconv(mystopwords, to='UTF-8')
docs<-tm_map(docs, removeWords, stopwords('english'))

docs<-tm_map(docs, stripWhitespace)
docs<-tm_map(docs, stemDocument)
#Extra stopwords, removed after stemming
myStopwords<-c('timescontentcom', 'reprint', 'bennett', 'coleman')
docs<-tm_map(docs, removeWords, myStopwords)
dtm<-DocumentTermMatrix(docs)
#rownames(dtm)<-hindustan[,4]
#Total frequency of each term across all documents
freq<-colSums(as.matrix(dtm))
length(freq)
ord<-order(freq, decreasing=TRUE)

#Save the DTM for the appropriate model.
#save(dtm, file='dtm_hindustanTOI.RData')
#save(dtm, file='dtm_toi.RData')
#save(dtm, file='dtm_hindustan.RData')



#Load the saved DTM for the appropriate model
#load('dtm_hindustanTOI.RData')
#load('dtm_toi.RData')
#load('dtm_hindustan.RData')

#Common code for all three models
#Set-up parameters for topic model
#Parameters here are from the optimal topic model found using the "ldaTopicsNumber" function below
burnin <- 4000
iter <- 2000
thin <- 500
seed <- list(2003,5,63,100001,765)  #five seeds, matching nstart
nstart <- 5
best <- TRUE
k <- 14

#Control list shared by LDA() and FindTopicsNumber()
gibbsControl <- list(nstart=nstart, seed=seed,
                     best=best, burnin=burnin,
                     iter=iter, thin=thin)

#Builds the output file name for a given suffix, e.g. "LDAGibbs 14 DocsToTopics.csv"
ldaFile <- function(suffix) paste('LDAGibbs',k,suffix)

#Fit the k-topic model by Gibbs sampling
ldaOut <- LDA(dtm, k, method='Gibbs', control=gibbsControl)

#Topic assignment for each document
ldaOut.topics <- as.matrix(topics(ldaOut))
write.csv(ldaOut.topics, file=ldaFile('DocsToTopics.csv'))

#Top 50 terms for each topic
ldaOut.terms <- as.matrix(terms(ldaOut,50))
write.csv(ldaOut.terms, file=ldaFile('TopicsToTerms.csv'))

#Per-document topic probabilities
topicProbabilities <- as.data.frame(ldaOut@gamma)
write.csv(topicProbabilities, file=ldaFile('TopicProbabilities.csv'))

#Per document: ratio of the largest to the second-largest topic probability
topic1ToTopic2 <- lapply(1:nrow(dtm), function(x){
  ranked <- sort(topicProbabilities[x,])
  ranked[k]/ranked[k-1]
})

#Per document: ratio of the second-largest to the third-largest topic probability
topic2ToTopic3 <- lapply(1:nrow(dtm), function(x){
  ranked <- sort(topicProbabilities[x,])
  ranked[k-1]/ranked[k-2]
})

write.csv(topic1ToTopic2, file=ldaFile('Topic1ToTopic2.csv'))
write.csv(topic2ToTopic3, file=ldaFile('Topic2ToTopic3.csv'))

#Function for finding the optimal number of topics
#Tries 2, 4, ..., 20 topics on 4 cores and reports the elapsed time
system.time(
  ldaTopicsNumber <- FindTopicsNumber(dtm, topics=seq(2,20, by=2), mc.cores=4L,
                                      metrics=c("CaoJuan2009", "Arun2010", "Deveaud2014"),
                                      control=gibbsControl, verbose=TRUE))


#Display number of topics in a plot. SI interprets this plot in the text.
FindTopicsNumber_plot(ldaTopicsNumber)


#Save the results for the appropriate model.
#hindustanTOItopics<-cbind(hindustanTOI, topicProbabilities)
#save(hindustanTOItopics, file='hindustanTOItopics.RData')

#hindustantopics<-cbind(hindustan, topicProbabilities)
#save(hindustantopics, file='hindustantopics.RData')

#TOItopics<-cbind(toi, topicProbabilities)
#save(TOItopics, file='TOItopics.RData')




###Learn about the topics for overall model combining Hindustan and TOI articles
library(lubridate)

load('hindustanTOItopics.RData')
hindustanTOItopics$Year<-year(hindustanTOItopics$Date)
hindustanTOItopics$Counter<-1
#Keep columns 3 to 19 (these include Source, Year and Counter, used below)
hindustanTOItopics2<-hindustanTOItopics[,c(3:19)]

#Group article topic percentages by year
#Sums every remaining column within each Year x Source cell, so Counter
#becomes the number of articles in the cell. `sum` is passed directly
#because funs() is deprecated in dplyr.
hindustanTOIYear<-as.data.frame(hindustanTOItopics2 %>%
  group_by(Year,Source) %>%
  summarise_all(sum, na.rm=TRUE))

#Estimate total number of articles per year using "the" for TOI and "of" for Hindustan
#The 20 hard-coded totals below must line up with the 20 Year x Source rows;
#stop here rather than let a different row count be recycled silently
stopifnot(nrow(hindustanTOIYear) == 20)
hindustanTOIYear$TotalArticles<-c(121635,23750,129435,55000,365319,50000,
                                  487454,75000,521069,75000,460228,68750,
                                  392429,37500,415959,118750,432202,462500,
                                  175704,225000)
#Drop the first and last year (rows 1-2 and 19-20)
hindustanTOIYear<-hindustanTOIYear[c(3:18),]
rownames(hindustanTOIYear) <- NULL

#Divide the summed columns 3 to 16 by the article count in each cell
hindustanTOIMean<-hindustanTOIYear
hindustanTOIMean[,c(3:16)]<-hindustanTOIMean[,c(3:16)]/hindustanTOIMean$Counter



### Figure 2 
#TOI=0, Hindustan=1
#Initially ran plots for each of the 14 topics.
#Found significantly different topics for 1, 2, and 12.
#Close the active graphics device if there is one; a bare dev.off() raises
#an error when only the null device is open
if (dev.cur() > 1) dev.off()
par(mar=c(5.1, 4.6, 4.1, 2.1))

#Draw one panel: the yearly mean of `topicColumn` for TOI (solid black line)
#with Hindustan overlaid (dashed red line) on the same 0-0.25 scale
#  topicColumn: name of a topic column of hindustanTOIMean, e.g. 'V1'
#  panelTitle:  plot title
plotTopicBySource <- function(topicColumn, panelTitle){
  topicFormula <- reformulate('Year', response=topicColumn)
  plot(topicFormula, hindustanTOIMean[hindustanTOIMean$Source==0,], type='o', ylim=c(0,0.25), main=panelTitle, ylab='Percentage of Articles',
       cex.axis=2, cex.lab=2, lwd=3, cex.main=2, cex=2)
  #Overlay the second series on the existing axes
  par(new=TRUE)
  plot(topicFormula, hindustanTOIMean[hindustanTOIMean$Source==1,], type='o', lty=2, pch=18, col='red', ylim=c(0,0.25), xlab='', ylab='', axes=FALSE,
       cex=2, lwd=3)
  legend(2014,0.25, legend=c('TOI', 'Hindustan'), col=c('black', 'red'), pch=c(1,18), lty=c(1,2), bty='n', lwd=3, cex=2)
  #Return nothing visibly so the legend details are not printed
  invisible(NULL)
}

plotTopicBySource('V1', 'Police Control')

plotTopicBySource('V2', 'Official Statements')

plotTopicBySource('V12', 'Political Parties')


#Per-source yearly means
source0<-hindustanTOIMean[hindustanTOIMean$Source==0,]
source1<-hindustanTOIMean[hindustanTOIMean$Source==1,]


### Table 1: Average Topic Distributions
#Topics are arranged: Police control, official statements, world without riots,
#description of riot events, police arrests, riots as a metaphor, riots in film/music,
#communal issues, court statements, women and children, sikhs, political parties,
#riots in food, government report 

#Compare overall differences
#Average of the yearly topic means (columns 3 to 16), one column per source
overallpct <- as.data.frame(cbind(TOI=colMeans(source0[,3:16]),
                                  Hindustan=colMeans(source1[,3:16])))
#overallpct$diff<-overallpct$TOI-overallpct$Hindustan
overallpct$TOI
overallpct$Hindustan

#Find maximum absolute differences
#Side-by-side copy of both sources: TOI topics sit in columns 3 to 16 and
#Hindustan topics in columns 21 to 34
sourcecombined <- cbind(source0,source1)
sourcecombined2 <- sourcecombined[,3:16]-sourcecombined[,21:34]
#Largest absolute yearly gap for each of the 14 topics
maxdiff <- vapply(1:14, function(i) max(abs(sourcecombined2[,i])), numeric(1))
maxdiff



###Table SI.3.1: Correlations
#Correlation between the TOI (Source 0) and Hindustan (Source 1) yearly
#series, computed separately for each of the 14 topic columns (3 to 16)
toiYearlyMeans <- hindustanTOIMean[hindustanTOIMean$Source==0,]
hindustanYearlyMeans <- hindustanTOIMean[hindustanTOIMean$Source==1,]
TOIHindustanCor <- vapply(1:14, function(i){
  cor(toiYearlyMeans[,2+i], hindustanYearlyMeans[,2+i])
}, numeric(1))
TOIHindustanCor




### Figure 1: Frequency of Riot Reporting
#Import NCRB riot data
NCR<-read.csv('NCRreport.csv', header=TRUE, stringsAsFactors = FALSE)

#Riot articles as a percentage of the estimated total articles per source-year
hindustanTOIMean$Pct<-(hindustanTOIMean$Counter/hindustanTOIMean$TotalArticles)*100

#Close the active graphics device if there is one; a bare dev.off() raises
#an error when only the null device is open
if (dev.cur() > 1) dev.off()
par(mar=c(5.1, 4.2, 4.1, 4.2))
#TOI series (left axis)
plot(Pct~Year, hindustanTOIMean[hindustanTOIMean$Source==0,], type='o', ylim=c(0,1),
     xlab='Year', ylab='Percent of Articles', main='Newspaper and NCRB Riot Reports', cex=1.5, cex.main=1.5, cex.axis=1.5, cex.lab=1.5, lwd=3)
#Hindustan series overlaid on the same scale
par(new=TRUE)
plot(Pct~Year, hindustanTOIMean[hindustanTOIMean$Source==1,], type='o', lty=2, pch=18, col='red', ylim=c(0,1), xlab='', ylab='', axes=FALSE,
     cex=2, lwd=3)
#NCRB riot counts overlaid on their own scale (right axis)
par(new=TRUE)
plot(Riots~Year, NCR[NCR$State=='Total',], type='o', lty=3, 
     col='blue', pch=16, xlab='', ylab='', axes=FALSE, xlim=c(2010,2017), cex=2, lwd=3)
axis(side=4, cex=1.5, cex.axis=1.5)
mtext(side=4, line=3, 'Reported Riots', cex=1.5)
#Legend position is given in the coordinates of the last (NCRB) plot
legend(2014,75000, legend=c('TOI', 'Hindustan', 'NCRB'), 
       lty=c(1,2,3), col=c('black', 'red', 'blue'), pch=c(1,18,16), cex=1.5, bty='n', lwd=3)

#Average yearly percentage for TOI, then Hindustan
mean(hindustanTOIMean[hindustanTOIMean$Source==0,]$Pct)
mean(hindustanTOIMean[hindustanTOIMean$Source==1,]$Pct)

#Correlations at the bottom of the Figure
#Hindustan vs TOI, then each newspaper vs NCRB riot counts with 2017 excluded
cor(hindustanTOIMean[hindustanTOIMean$Source==1,]$Pct, hindustanTOIMean[hindustanTOIMean$Source==0,]$Pct)
cor(hindustanTOIMean[hindustanTOIMean$Source==0 & hindustanTOIMean$Year!=2017,]$Pct, NCR[NCR$State=='Total',]$Riots)
cor(hindustanTOIMean[hindustanTOIMean$Source==1 & hindustanTOIMean$Year!=2017,]$Pct, NCR[NCR$State=='Total',]$Riots)



### Prepare to examine city and communal mentions in articles
#Count number of certain words
#All states and union provinces and 25 largest cities with at least one hit
load('dtm_hindustanTOI.RData')
#Terms to look up in the DTM: place names followed by four communal terms
searchTerms<-c("uttar", 'maharashtra', 'bihar', 'bengal', 'madhya', 'nadu',
               'rajasthan', 'karnataka', 'gujarat', 'andhra', 'odisha',
               'telangana', 'kerala', 'jharkhand', 'assam', 'punjab',
               'chhattisgarh', 'haryana', 'jammu', 'uttarakhand',
               'tripura', 'meghalaya', 'manipur', 'nagaland', 'goa',
               'mizoram', 'sikkim', 'mumbai', 'delhi', 'hyderabad',
               'ahmedabad', 'chennai', 'kolkata', 'surat', 'pune',
               'jaipur', 'lucknow', 'kanpur', 'nagpur', 'visakhapatnam',
               'thane', 'bhopal', 'chinchwad', 'patna', 'vadodara',
               'ghaziabad', 'ludhiana', 'agra', 'madurai', 'hindu', 
               'muslim', 'communal', 'victim')
#Per-document counts of each search term
placenames<-as.matrix(dtm[,searchTerms])
sort(colSums(placenames), decreasing = TRUE)
hindustanTOItopics3<-cbind(hindustanTOItopics, placenames)

#Map largest cities to states
hindustanTOItopics3$bihar_complete<-with(hindustanTOItopics3, bihar+patna)
hindustanTOItopics3$up_complete<-with(hindustanTOItopics3,
                                      uttar+lucknow+kanpur+ghaziabad+agra)
hindustanTOItopics3$maharashtra_complete<-with(hindustanTOItopics3,
                                               maharashtra+mumbai+pune+nagpur+thane+chinchwad)
hindustanTOItopics3$gujarat_complete<-with(hindustanTOItopics3,
                                           gujarat+ahmedabad+surat+vadodara)

#Take city/state mentions and collapse by year
#Column 3 plus columns 18 to 76 (these include Source, Year and Counter)
hindustanTOItopics4<-hindustanTOItopics3[,c(3,18:76)]
#`sum` is passed directly because funs() is deprecated in dplyr
hindustanTOIYear2<-as.data.frame(hindustanTOItopics4 %>%
                                   group_by(Year,Source) %>%
                                   summarise_all(sum, na.rm=TRUE))
hindustanTOIYear2<-hindustanTOIYear2[c(3:18),]
rownames(hindustanTOIYear2) <- NULL

#Mentions per article in each Year x Source cell
hindustanTOIMean2<-hindustanTOIYear2
hindustanTOIMean2[,c(4:60)]<-hindustanTOIMean2[,c(4:60)]/hindustanTOIMean2$Counter
#hindustanTOIMean2<-hindustanTOIMean2[c(1:16),]

#Create variables for whether an article mentions Hindu, Muslim, or Communal
hindustanTOItopics4$hindu01<-ifelse(hindustanTOItopics4$hindu==0,0,1)
hindustanTOItopics4$muslim01<-ifelse(hindustanTOItopics4$muslim==0,0,1)
hindustanTOItopics4$communal01<-ifelse(hindustanTOItopics4$communal==0,0,1)
hindustanTOItopics4$hinducommunal<-ifelse(hindustanTOItopics4$communal01==1 & hindustanTOItopics4$hindu01==1, 1, 0)
hindustanTOItopics4$muslimcommunal<-ifelse(hindustanTOItopics4$communal01==1 & hindustanTOItopics4$muslim01==1, 1, 0)
hindustanTOItopics4$hindumuslim<-ifelse(hindustanTOItopics4$muslim01==1 & hindustanTOItopics4$hindu01==1, 1, 0)
hindustanTOItopics4$victim01<-ifelse(hindustanTOItopics4$victim!=0, 1, 0)
#The victim indicator is referenced by its full name `victim01`; the earlier
#`$victim0` spelling only resolved through partial matching of `$`
hindustanTOItopics4$victimhindu<-ifelse(hindustanTOItopics4$victim01==1 & hindustanTOItopics4$hindu01==1, 1, 0)
hindustanTOItopics4$victimmuslim<-ifelse(hindustanTOItopics4$victim01==1 & hindustanTOItopics4$muslim01==1, 1, 0)
hindustanTOItopics4$victimcommunal<-ifelse(hindustanTOItopics4$victim01==1 & hindustanTOItopics4$communal01==1, 1, 0)



### Table 3: Hindu-Muslim Riots
#Split out articles by source
hindustanTOItopics4source0 <- hindustanTOItopics4[hindustanTOItopics4$Source==0,]
hindustanTOItopics4source1 <- hindustanTOItopics4[hindustanTOItopics4$Source==1,]

#Mentions per article
colSums(hindustanTOItopics4source0[,53:60])/nrow(hindustanTOItopics4source0)
colSums(hindustanTOItopics4source1[,53:60])/nrow(hindustanTOItopics4source1)

#Rows of hindustanTOItopics4 from one source (0 = TOI, 1 = Hindustan) whose
#count of `term` is non-zero (mentioned = TRUE) or zero (mentioned = FALSE)
pickArticles <- function(sourceCode, term, mentioned){
  termCount <- hindustanTOItopics4[[term]]
  hasTerm <- if (mentioned) termCount != 0 else termCount == 0
  hindustanTOItopics4[hindustanTOItopics4$Source==sourceCode & hasTerm,]
}

#Sentiment scores for the given rows; the text is taken from column 2 of
#hindustanTOItopics, matched by row name
scoreArticles <- function(articles){
  score.sentiment(hindustanTOItopics[rownames(articles),2], pos.words, neg.words, .progress='text')
}

#Naming: <term><source><mentioned>, e.g. muslim01 = TOI articles mentioning "muslim"
muslim01 <- pickArticles(0, 'muslim', TRUE)
muslim00 <- pickArticles(0, 'muslim', FALSE)
muslim11 <- pickArticles(1, 'muslim', TRUE)
muslim10 <- pickArticles(1, 'muslim', FALSE)

#Each of the following calculations refer to rows and columns in Table 3
#Sentiment and percentage of articles
#TOI and Muslim
muslim01_score <- scoreArticles(muslim01)
muslim01$Counter <- 1
muslim01$score <- muslim01_score$scores
sum(muslim01_score$scores)/nrow(muslim01_score)
nrow(muslim01)/nrow(hindustanTOItopics4source0)

#Hindustan Muslim
muslim11_score <- scoreArticles(muslim11)
muslim11$Counter <- 1
muslim11$score <- muslim11_score$scores
sum(muslim11_score$scores)/nrow(muslim11_score)
nrow(muslim11)/nrow(hindustanTOItopics4source1)

#Hindu
hindu01 <- pickArticles(0, 'hindu', TRUE)
hindu00 <- pickArticles(0, 'hindu', FALSE)
hindu11 <- pickArticles(1, 'hindu', TRUE)
hindu10 <- pickArticles(1, 'hindu', FALSE)

#TOI and Hindu
hindu01_score <- scoreArticles(hindu01)
hindu01$Counter <- 1
hindu01$score <- hindu01_score$scores
sum(hindu01_score$scores)/nrow(hindu01_score)
nrow(hindu01)/nrow(hindustanTOItopics4source0)

#Hindustan Hindu
hindu11_score <- scoreArticles(hindu11)
hindu11$Counter <- 1
hindu11$score <- hindu11_score$scores
sum(hindu11_score$scores)/nrow(hindu11_score)
nrow(hindu11)/nrow(hindustanTOItopics4source1)


#Communal
communal01 <- pickArticles(0, 'communal', TRUE)
communal00 <- pickArticles(0, 'communal', FALSE)
communal11 <- pickArticles(1, 'communal', TRUE)
communal10 <- pickArticles(1, 'communal', FALSE)

#TOI and communal
communal01_score <- scoreArticles(communal01)
communal01$Counter <- 1
communal01$score <- communal01_score$scores
sum(communal01_score$scores)/nrow(communal01_score)
nrow(communal01)/nrow(hindustanTOItopics4source0)

#Hindustan and communal
communal11_score <- scoreArticles(communal11)
communal11$Counter <- 1
communal11$score <- communal11_score$scores
sum(communal11_score$scores)/nrow(communal11_score)
nrow(communal11)/nrow(hindustanTOItopics4source1)

#Hindustan Overall
hindustan_overall <- score.sentiment(hindustanTOItopics[hindustanTOItopics$Source==1,2], pos.words, neg.words, .progress='text')
hindustan_overall$Counter <- 1
hindustan_overall$score <- hindustan_overall$scores
sum(hindustan_overall$scores)/nrow(hindustan_overall)

#TOI Overall
toi_overall <- score.sentiment(hindustanTOItopics[hindustanTOItopics$Source==0,2], pos.words, neg.words, .progress='text')
toi_overall$Counter <- 1
toi_overall$score <- toi_overall$scores
sum(toi_overall$scores)/nrow(toi_overall)





#Above 1000: delhi, gujarat, uttar, bihar, mumbai, lucknow, ahmedabad
#uttar, bihar
#delhi because many mentions, but few riots
#seems that this is because delhi is a site for commissions, central government
#discussions about riots
#Yearly per-article mention rates, split by source (0 = TOI, 1 = Hindustan)
hindustanTOIMean2source0 <- hindustanTOIMean2[hindustanTOIMean2[['Source']]==0,]
hindustanTOIMean2source1 <- hindustanTOIMean2[hindustanTOIMean2[['Source']]==1,]




### Table 2: Mentions of States and Cities
#Sum over columns 4 to 52 of the average yearly mention rate
#For TOI
a <- colMeans(hindustanTOIMean2source0[,4:52])
sum(a)

#For Hindustan
b <- colMeans(hindustanTOIMean2source1[,4:52])
sum(b)

#For each place below: TOI average first, then Hindustan average
#Bihar (state plus mapped city)
mean(hindustanTOIMean2source0[['bihar_complete']])
mean(hindustanTOIMean2source1[['bihar_complete']])

#Uttar Pradesh (state plus mapped cities)
mean(hindustanTOIMean2source0[['up_complete']])
mean(hindustanTOIMean2source1[['up_complete']])

#Gujarat (state plus mapped cities)
mean(hindustanTOIMean2source0[['gujarat_complete']])
mean(hindustanTOIMean2source1[['gujarat_complete']])

#Jharkhand
mean(hindustanTOIMean2source0[['jharkhand']])
mean(hindustanTOIMean2source1[['jharkhand']])

#Uttarakhand
mean(hindustanTOIMean2source0[['uttarakhand']])
mean(hindustanTOIMean2source1[['uttarakhand']])

#Delhi
mean(hindustanTOIMean2source0[['delhi']])
mean(hindustanTOIMean2source1[['delhi']])

#Maharashtra (state plus mapped cities)
mean(hindustanTOIMean2source0[['maharashtra_complete']])
mean(hindustanTOIMean2source1[['maharashtra_complete']])

#Karnataka
mean(hindustanTOIMean2source0[['karnataka']])
mean(hindustanTOIMean2source1[['karnataka']])



### Table SI.3.1: Mutual Information
library(entropy)
#Yearly topic means split by source (0 = TOI, 1 = Hindustan)
hindustanTOIMeansource0<-hindustanTOIMean[hindustanTOIMean$Source==0,]
hindustanTOIMeansource1<-hindustanTOIMean[hindustanTOIMean$Source==1,]

#Corresponding mutual information, doesn't need to sum to one
#A stray mi.plugin(freqs2d, unit='log2') call used to sit here, before
#freqs2d was created, and failed on a fresh session; the loop below already
#computes the same quantity for every topic

#For each of the 14 topics (columns 3 to 16): stack the TOI and Hindustan
#yearly series as the two rows of a table and compute its mutual
#information in bits
mutualinfo<-numeric(14)
for(i in 1:14){
  freqs2d <- rbind(hindustanTOIMeansource0[,2+i], hindustanTOIMeansource1[,2+i])
  mutualinfo[i]<-mi.plugin(freqs2d, unit='log2')}
mutualinfo

#Absolute TOI-Hindustan correlation next to the mutual information, per topic
cor_entropy<-as.data.frame(cbind(abs(TOIHindustanCor),mutualinfo))
cor_entropy


